library(cluster)
## Warning: package 'cluster' was built under R version 3.4.4
library(mclust)
## Warning: package 'mclust' was built under R version 3.4.4
## Package 'mclust' version 5.4.1
## Type 'citation("mclust")' for citing this R package in publications.
library(lattice)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.2
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1.9000     ✔ purrr   0.2.4     
## ✔ tibble  1.4.2          ✔ dplyr   0.7.5     
## ✔ tidyr   0.8.1          ✔ stringr 1.3.1     
## ✔ readr   1.1.1          ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.2
## Warning: package 'forcats' was built under R version 3.4.3
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::map()    masks mclust::map()
library(maptree)
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 3.4.3
library(mice)
## Warning: package 'mice' was built under R version 3.4.4
## 
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
## 
##     complete
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(mclust)
library(mice)
library(nFactors)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: psych
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:mclust':
## 
##     sim
## Loading required package: boot
## Warning: package 'boot' was built under R version 3.4.1
## 
## Attaching package: 'boot'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:lattice':
## 
##     melanoma
## 
## Attaching package: 'nFactors'
## The following object is masked from 'package:lattice':
## 
##     parallel
library(poLCA)
## Loading required package: scatterplot3d
## Warning: package 'scatterplot3d' was built under R version 3.4.4
library(Rtsne)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded

Utility Functions

utility_histogramplotter <- function(x) {
  lattice::histogram( ~ x)
}
seg.summ <- function(data, groups) {
  aggregate(data, list(groups), function(x) mean(as.numeric(as.character(x))))
}
minmax <- function(x)
{
    return((x- min(x)) /(max(x)-min(x)))
}
plot_bi_plots <- function(df, y,...) {
  old.par = par()
  par(mar=c(5,4,5,2))
  on.exit(expr = 'par = old.par')
  prcomp(df) %>% biplot(col = c('gray', 'red'), cex = .8, main = y, ...)
}
adjust_q13 <- function(df){
  df %<>% 
    mutate(q13_visitfreq_social= (q13r1+q13r2+q13r3 +q13r11)/4,
           q13_visitfreq_music = (q13r4+q13r7+q13r8 +q13r9)/4,
           q13_visitfreq_video = (q13r5+q13r6+q13r10+q13r12)/4)
  df[,!str_detect(names(df),'q13r')]
}
adjust_q24 <- function(df){
  df %<>% 
    mutate(
      q24_tech_posatt = (q24r1+q24r2+q24r3+q24r5+q24r6)/5,
      q24_tech_enter = (q24r7+q24r8)/2,
      q24_tech_comm = (q24r10+q24r11+q24r12)/3,
      q24_tech_negatv = (q24r4+q24r9)/2
    )
  df[,!str_detect(names(df),'q24r')]
}
adjust_q25 <- function(df){
  df %<>% 
    mutate(
      q25_prsnlty_leader = (q25r1+q25r2+q25r3+q25r4)/4,
      q25_prsnlty_risk   = (q25r5+q25r7+q25r8)/3,
      q25_prsnlty_drive  = (q25r9+q25r10+q25r11+q25r12)/4,
      q25_prsnlty_follower = q25r6
    )
  df[,!str_detect(names(df),'q25r')]
}
adjust_q26 <- function(df){
  df %<>% 
    mutate(
      q26_shopsavvy_bargain =  (q26r3+q26r5)/2,
      q26_shopsavvy_brands = (q26r18+q26r7+q26r13+q26r14+q26r15)/5,
      q26_shopsavvy_earn2spend = (q26r13+q26r16+q26r4)/3,
      q26_shopsavvy_applover = (q26r17+q26r12+q26r10+q26r8+q26r6+q26r9)/6,
      q26_shopsavvy_children = q26r11,
    )
  df[,!str_detect(names(df),'q26r')]
}
adjust_q2 <- function(df){
  df %<>%
    mutate(
      q2_apple = ifelse(q2r1==1|q2r2==1,1,0),
      q2_andriod = q2r3,
      q2_windows = q2r6,
      q2_tablet = q2r8,
      q2_other = ifelse(q2r4==1|q2r5==1|q2r7==1|q2r9==1,1,0)
    )
  df[,!str_detect(names(df),'q2r')]
}
adjust_q4 <- function(df){
    df %<>%
    mutate(
      q4_use_music_apps =q4r1,
      q4_use_tv_apps =ifelse(q4r2==1|q4r3==1|q4r4==1,1,0),
      q4_use_game_apps =q4r5,
      q4_use_social_apps =q4r6,
      q4_use_news_apps =ifelse(q4r7==1|q4r9==1,1,0),
      q4_use_shop_apps =q4r8,
      q4_use_none_apps =q4r11
    )
  df[,!str_detect(names(df),'q4r')]
}
adjust_race <- function(df){
  df %>% 
    mutate(
      q54_white  = ifelse(q54==1,1,0),
      q54_black  = ifelse(q54==2,1,0),
      q54_asian  = ifelse(q54==3,1,0),
      q54_hawai  = ifelse(q54==4,1,0),
      q54_native =  ifelse(q54==5,1,0),
      q54_other  = ifelse(q54==6,1,0),
      q55_latino =  ifelse(q55==1,1,0)
    ) %>% 
    dplyr::select(-q54,-q55)
}
adjust_gender <- function(df){
  df %>% 
    mutate(q57 = ifelse(q57==1,1,0))
}
adjust_marital <- function(df){
  df %>% 
    mutate(q49 = ifelse(q49==1,1,0))
}
adjust_q11 <- function(df){
  #Q11 has artificial ordinality between #of apps increasing, and response==5 as "Dont know"
  #This function resolves this problem
  df %>% 
    mutate(q11 = ifelse(q11==5 | q11==6, 0, q11))
}
adjust_children <- function(df){
  df %>% 
    dplyr::select(-q50r2_inftod,-q50r3_6_12,-q50r4_13_17,-q50r5_adult)
}
adjust_income <- function(df){
  df %>% 
    dplyr::mutate(q56 = case_when(
      q56 <= 4 ~ 1,
      q56 >= 5 & q56 <=8 ~ 2,
      q56 >= 9 & q56 <=11 ~ 3,
      q56 >= 12 & q56 <= 13 ~ 4,
      q56 >= 14 ~ 5
    ))
}
adjust_age <- function(df){
  df %>% 
    dplyr::mutate(
      q1 = case_when(
        q1 <= 2 ~ 1,
        q1 >= 3 & q1 <= 5 ~ 2,
        q1 >= 6 & q1 <= 8 ~ 3,
        q1 >= 9 & q1 <= 11 ~ 4,
        q1 >= 12 ~ 5,
      )
    )
}
adjust_names <- function(df){
  df %>% 
    dplyr::rename(
      q1_age=q1,
      q11_appnum = q11,
      q12_freeapppc = q12,
      q48_edu = q48,
      q49_marital = q49,
      q56_income = q56,
      q57_mf = q57,
      q50r1_nochild = q50r1,
      q50r2_inftod = q50r2, 
      q50r3_6_12 = q50r3,
      q50r4_13_17 = q50r4,
      q50r5_adult = q50r5
    )
}

Data Prep

load('../data/apphappyData.RData')
df_labs <- tbl_df(apphappy.3.labs.frame)
df_nums <- tbl_df(apphappy.3.num.frame)
dim(df_nums)
## [1] 1800   89
rm(apphappy.3.num.frame); rm(apphappy.3.labs.frame)
df_labs$q57 <- as.factor(df_labs$q57)
df_labs$caseID <- NULL
df_nums$caseID <- NULL
df_labs$q2r10 <- NULL
df_nums$q2r10 <- NULL
df_labs$q5r1 <- NULL
df_nums$q5r1 <- NULL

Remove missing values

Handle the case of the missing apps?

df_nums$q12[is.na(df_nums$q12)] <- 0
df_nums %>% xtabs(~q12+q11,.,addNA = T)
##    q11
## q12   1   2   3   4   5   6
##   0   0   0   0   0   0  24
##   1   6   9   5   3   1   0
##   2  31  56  48  66   2   0
##   3  23  42 117 114  11   0
##   4  12  44 150 198  12   0
##   5  23  44 158 209  20   0
##   6  68  78 131  71  24   0
df_nums %>% xtabs(~q4r10+q11,.,addNA = T)
##      q11
## q4r10   1   2   3   4   5   6
##     0 156 254 564 595  67  24
##     1   7  19  45  66   3   0
df_nums %>% xtabs(~q4r11+q11,.,addNA = T)
##      q11
## q4r11   1   2   3   4   5   6
##     0 148 267 606 661  65   8
##     1  15   6   3   0   5  16
# RULE (A): IF q4r11=TRUE, it means you have no apps. So that means q11 has to equal 6, i.e. NONE. Data shows this is violated. Correcting using this rule.
df_nums$q11[df_nums$q4r11==TRUE] <- 6
df_nums %>% xtabs(~q4r11+q11,.,addNA = T)
##      q11
## q4r11   1   2   3   4   5   6
##     0 148 267 606 661  65   8
##     1   0   0   0   0   0  45

Small changes

Since q11 can be ordinal, “none” should equal 0 instead of 6 to preserve ordinality

# RULE (B): Set q11=6 to q11=0
df_nums$q11[df_nums$q11==6] <- 0
df_nums %>% xtabs(~q11,., addNA = T)
## q11
##   0   1   2   3   4   5 
##  53 148 267 606 661  65

‘Dont know’ doesn’t add value. Perhaps these can be set to NA and then imputed using mice.

# RULE (C): Set q11 Dont know to NA
df_nums$q11[df_nums$q11==0] <- NA
df_nums %>% xtabs(~q11,., addNA = T)
## q11
##    1    2    3    4    5 <NA> 
##  148  267  606  661   65   53
# RULE (D): All NA values for q12 set to 6, since I'm approximating that if you don't have any apps, might as well could as free?
df_nums$q12[is.na(df_nums$q12)] <- 6
df_nums %>% xtabs(~q12,., addNA = T)
## q12
##   0   1   2   3   4   5   6 
##  24  24 203 307 416 454 372
map_int(df_nums,~sum(is.na(.x))) %>% sort() %>% tail(3) %>% barchart(main='Missing values')

Imputing using mice

map_df(df_nums,~as.factor(.x)) %>% 
  mice::mice(printFlag = T, m = 5, method = 'rf') -> miceFit
## 
##  iter imp variable
##   1   1  q11  q57
##   1   2  q11  q57
##   1   3  q11  q57
##   1   4  q11  q57
##   1   5  q11  q57
##   2   1  q11  q57
##   2   2  q11  q57
##   2   3  q11  q57
##   2   4  q11  q57
##   2   5  q11  q57
##   3   1  q11  q57
##   3   2  q11  q57
##   3   3  q11  q57
##   3   4  q11  q57
##   3   5  q11  q57
##   4   1  q11  q57
##   4   2  q11  q57
##   4   3  q11  q57
##   4   4  q11  q57
##   4   5  q11  q57
##   5   1  q11  q57
##   5   2  q11  q57
##   5   3  q11  q57
##   5   4  q11  q57
##   5   5  q11  q57
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'default/
## America/Indiana/Indianapolis'
## Warning: Number of logged events: 25
df_nums <- tbl_df(mice::complete(miceFit))

Bi Plots

# questions_to_consolidate <- c('q13','q24','q25','q26')
# plot_bi_plots(df_nums %>% dplyr::select(contains(questions_to_consolidate[1])),questions_to_consolidate[1])
# plot_bi_plots(df_nums %>% dplyr::select(contains(questions_to_consolidate[2])),questions_to_consolidate[2], xlim=c(0,.05), ylim=c(-0.01,0.002))
# plot_bi_plots(df_nums %>% dplyr::select(contains(questions_to_consolidate[3])),questions_to_consolidate[3])
# plot_bi_plots(df_nums %>% dplyr::select(contains(questions_to_consolidate[4])),questions_to_consolidate[4], xlim=c(0,0.05),ylim=c(-0.011,0.005))

Data Prep

Sub grouping

df_nums <- map_df(df_nums,~as.numeric(as.character(.x)))
df_nums_adj <- df_nums %>%  
  adjust_q13() %>% 
  adjust_q24() %>% 
  adjust_q25() %>% 
  adjust_q26() %>% 
  adjust_q2() %>% 
  adjust_q4() %>% 
  adjust_race() %>% 
  adjust_gender() %>% 
  adjust_marital() %>% 
  adjust_income() %>%
  adjust_age() %>% 
  adjust_q11() %>% 
  adjust_names() %>% 
  adjust_children()
## Warning: package 'bindrcpp' was built under R version 3.4.4
glimpse(df_nums_adj)
## Observations: 1,800
## Variables: 43
## $ q1_age                   <dbl> 1, 1, 3, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1...
## $ q11_appnum               <dbl> 4, 3, 3, 4, 3, 4, 4, 3, 1, 2, 3, 2, 4...
## $ q12_freeapppc            <dbl> 5, 5, 6, 4, 6, 3, 3, 6, 5, 2, 5, 6, 2...
## $ q48_edu                  <dbl> 3, 4, 3, 4, 4, 1, 3, 4, 3, 3, 4, 3, 4...
## $ q49_marital              <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0...
## $ q50r1_nochild            <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1...
## $ q56_income               <dbl> 2, 3, 4, 2, 2, 1, 1, 3, 2, 2, 2, 2, 4...
## $ q57_mf                   <dbl> 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0...
## $ q13_visitfreq_social     <dbl> 2.50, 3.00, 3.25, 4.00, 2.50, 4.00, 2...
## $ q13_visitfreq_music      <dbl> 3.75, 3.50, 2.75, 3.25, 3.50, 3.50, 4...
## $ q13_visitfreq_video      <dbl> 2.50, 2.00, 2.50, 2.50, 1.75, 2.25, 3...
## $ q24_tech_posatt          <dbl> 2.8, 2.8, 3.0, 3.0, 2.2, 3.4, 2.8, 2....
## $ q24_tech_enter           <dbl> 2.0, 2.0, 1.5, 3.0, 2.5, 1.0, 2.5, 2....
## $ q24_tech_comm            <dbl> 2.666667, 1.333333, 2.666667, 3.66666...
## $ q24_tech_negatv          <dbl> 1.0, 3.0, 4.5, 3.0, 1.5, 4.5, 1.5, 4....
## $ q25_prsnlty_leader       <dbl> 1.50, 2.75, 2.50, 2.00, 3.25, 3.00, 3...
## $ q25_prsnlty_risk         <dbl> 1.666667, 2.333333, 3.333333, 2.66666...
## $ q25_prsnlty_drive        <dbl> 2.75, 2.25, 2.00, 3.25, 3.00, 2.50, 2...
## $ q25_prsnlty_follower     <dbl> 5, 5, 6, 5, 4, 4, 2, 5, 5, 2, 6, 5, 2...
## $ q26_shopsavvy_bargain    <dbl> 2.5, 3.5, 3.0, 3.0, 3.0, 3.0, 3.5, 1....
## $ q26_shopsavvy_brands     <dbl> 3.4, 3.2, 3.4, 3.0, 4.0, 3.2, 2.8, 3....
## $ q26_shopsavvy_earn2spend <dbl> 2.666667, 3.333333, 3.666667, 3.66666...
## $ q26_shopsavvy_applover   <dbl> 4.166667, 3.000000, 4.166667, 2.16666...
## $ q26_shopsavvy_children   <dbl> 6, 6, 4, 6, 3, 5, 5, 6, 5, 1, 6, 6, 1...
## $ q2_apple                 <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1...
## $ q2_andriod               <dbl> 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1...
## $ q2_windows               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1...
## $ q2_tablet                <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1...
## $ q2_other                 <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1...
## $ q4_use_music_apps        <dbl> 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1...
## $ q4_use_tv_apps           <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1...
## $ q4_use_game_apps         <dbl> 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1...
## $ q4_use_social_apps       <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1...
## $ q4_use_news_apps         <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1...
## $ q4_use_shop_apps         <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1...
## $ q4_use_none_apps         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ q54_white                <dbl> 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1...
## $ q54_black                <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0...
## $ q54_asian                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ q54_hawai                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ q54_native               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ q54_other                <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0...
## $ q55_latino               <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...

Selection of basis variables

set1 <- c(
  'q1_age',
  # 'q11_appnum',
  # 'q12_freeapppc',
  'q48_edu',
  'q49_marital',
  'q50r1_nochild',
  'q56_income',
  # 'q57_mf',
  # 'q13_visitfreq_social',
  # 'q13_visitfreq_music',
  # 'q13_visitfreq_video',
  'q24_tech_posatt',
  'q24_tech_enter',
  'q24_tech_comm',
  'q24_tech_negatv',
  # 'q25_prsnlty_leader',
  # 'q25_prsnlty_risk',
  # 'q25_prsnlty_drive',
  # 'q25_prsnlty_follower',
  'q26_shopsavvy_bargain',
  'q26_shopsavvy_brands',
  'q26_shopsavvy_earn2spend',
  'q26_shopsavvy_applover',
  'q26_shopsavvy_children',
  'q2_apple',
  'q2_andriod',
  'q2_windows',
  # 'q2_tablet',
  # 'q2_other',
  # 'q4_use_music_apps',
  # 'q4_use_tv_apps',
  # 'q4_use_game_apps',
  # 'q4_use_social_apps',
  # 'q4_use_news_apps',
  # 'q4_use_shop_apps',
  # 'q4_use_none_apps',
  'q54_white',
  'q54_black',
  'q54_asian',
  'q54_hawai',
  'q54_native',
  # 'q54_other',
  'q55_latino'
  )
set2 <- c(
  'q1_age',
  # 'q11_appnum',
  'q12_freeapppc',
  'q48_edu',
  'q49_marital',
  'q50r1_nochild',
  'q56_income',
  # 'q57_mf',
  'q13_visitfreq_social',
  'q13_visitfreq_music',
  'q13_visitfreq_video',
  'q24_tech_posatt',
  'q24_tech_enter',
  'q24_tech_comm',
  'q24_tech_negatv',
  'q25_prsnlty_leader',
  'q25_prsnlty_risk',
  'q25_prsnlty_drive',
  'q25_prsnlty_follower',
  'q26_shopsavvy_bargain',
  'q26_shopsavvy_brands',
  'q26_shopsavvy_earn2spend',
  'q26_shopsavvy_applover',
  'q26_shopsavvy_children',
  # 'q2_apple',
  # 'q2_andriod',
  # 'q2_windows',
  # 'q2_tablet',
  # 'q2_other',
  # 'q4_use_music_apps',
  # 'q4_use_tv_apps',
  # 'q4_use_game_apps',
  # 'q4_use_social_apps',
  # 'q4_use_news_apps',
  # 'q4_use_shop_apps',
  # 'q4_use_none_apps',
  'q54_white',
  'q54_black',
  'q54_asian',
  'q54_hawai',
  'q54_native',
  # 'q54_other',
  'q55_latino'
  )
set3 <- c(
  'q1_age',
  'q11_appnum',
  'q12_freeapppc',
  'q48_edu',
  'q49_marital',
  'q50r1_nochild',
  'q56_income',
  'q57_mf',
  # 'q13_visitfreq_social',
  # 'q13_visitfreq_music',
  # 'q13_visitfreq_video',
  # 'q24_tech_posatt',
  # 'q24_tech_enter',
  # 'q24_tech_comm',
  # 'q24_tech_negatv',
  # 'q25_prsnlty_leader',
  # 'q25_prsnlty_risk',
  # 'q25_prsnlty_drive',
  # 'q25_prsnlty_follower',
  'q26_shopsavvy_bargain',
  'q26_shopsavvy_brands',
  'q26_shopsavvy_earn2spend',
  'q26_shopsavvy_applover',
  'q26_shopsavvy_children',
  'q2_apple',
  'q2_andriod',
  'q2_windows',
  'q2_tablet',
  'q2_other',
  'q4_use_music_apps',
  'q4_use_tv_apps',
  'q4_use_game_apps',
  'q4_use_social_apps',
  'q4_use_news_apps',
  'q4_use_shop_apps',
  'q4_use_none_apps'
  # 'q54_white',
  # 'q54_black',
  # 'q54_asian',
  # 'q54_hawai',
  # 'q54_native',
  # 'q54_other',
  # 'q55_latino'
  )
set4 <- c(
  'q1_age',
  # 'q11_appnum',
  # 'q12_freeapppc',
  # 'q48_edu',
  'q49_marital',
  'q50r1_nochild',
  'q56_income',
  'q57_mf',
  'q13_visitfreq_social',
  'q13_visitfreq_music',
  'q13_visitfreq_video',
  'q24_tech_posatt',
  'q24_tech_enter',
  'q24_tech_comm',
  'q24_tech_negatv',
  'q25_prsnlty_leader',
  'q25_prsnlty_risk',
  'q25_prsnlty_drive',
  'q25_prsnlty_follower',
  'q26_shopsavvy_bargain',
  'q26_shopsavvy_brands',
  'q26_shopsavvy_earn2spend',
  'q26_shopsavvy_applover',
  'q26_shopsavvy_children'
  # 'q2_apple',
  # 'q2_andriod',
  # 'q2_windows',
  # 'q2_tablet',
  # 'q2_other',
  # 'q4_use_music_apps',
  # 'q4_use_tv_apps',
  # 'q4_use_game_apps',
  # 'q4_use_social_apps',
  # 'q4_use_news_apps',
  # 'q4_use_shop_apps',
  # 'q4_use_none_apps'
  # 'q54_white',
  # 'q54_black',
  # 'q54_asian',
  # 'q54_hawai',
  # 'q54_native',
  # 'q54_other',
  # 'q55_latino'
  )
df_nums_adj_backup <- df_nums_adj
df_nums_adj <- df_nums_adj_backup
df_nums_adj <- df_nums_adj %>% dplyr::select(one_of(set4))

Scaling

scaling_wanted <- T
minmax_wanted <- F
if (scaling_wanted) {
  df_nums_adjscaled <- scale(df_nums_adj)
  centers <- attributes(df_nums_adjscaled)[[3]]
  spreads <- attributes(df_nums_adjscaled)[[4]]
  df_nums_adjscaled <- tbl_df(df_nums_adjscaled)
}
if(minmax_wanted){
  df_nums_adjscaled <- map_df(df_nums_adj, ~minmax(.x))
}
if(!scaling_wanted & !minmax_wanted){
  df_nums_adjscaled <- df_nums_adj
}
head(df_nums_adjscaled)

Correlation Plot

df_nums_adjscaled %>% cor() %>% corrplot(method = 'shade',tl.col = 'black',tl.cex = .9, order = 'hclust', hclust.method = 'ward.D2')

Dissimilarity Calculations

# ordered=c(1:4,11,13:28)
# symm_bin=c(6:10,12,29:47)
# df_nums_adjscaled[ordered] <- map_df(df_nums_adjscaled[ordered],~as.ordered(.x))
# df_nums_adjscaled[symm_bin] <- map_df(df_nums_adjscaled[symm_bin],~as.factor(.x))
distMat <- daisy(df_nums_adjscaled)
## Warning in daisy(df_nums_adjscaled): binary variable(s) 2, 3, 5 treated as
## interval scaled

Simple clustering

H clust

How cophenetic changes with clustering method?

methods <- c('complete','average','ward.D','ward.D2','median','mcquitty','centroid')
method_vs_cop <- map_dbl(methods,~cor(cophenetic(hclust(d = distMat, method = .x)), distMat))
names(method_vs_cop) <- methods
barchart(sort(method_vs_cop))

k = 6
hclustFit <- hclust(d = distMat, method = 'ward.D')
plot(hclustFit, labels = F)
rect.hclust(hclustFit, k=k, border="red")

hclust_segments <- cutree(hclustFit, k = k)
table(hclust_segments)
## hclust_segments
##   1   2   3   4   5   6 
## 302 432 154 380 164 368
clusplot(df_nums_adjscaled, hclust_segments, color=TRUE, shade=TRUE, labels=4, lines=0, main="HClust plot")

seg.summ(df_nums_adj, hclust_segments) -> centroids
centroids
cutFit <- cutree(hclustFit, k)
plot(silhouette(cutFit,distMat))

k <- 2:10
widths <- NULL
for(k_ in k){
  hclustFit <- hclust(d = distMat, method = 'ward.D')
  cutFit <- cutree(hclustFit, k_)
  widths <- c(widths,mean(silhouette(cutFit,distMat)[,'sil_width']))
}
plot(k,widths,type='b')

cbind(k,widths)
##        k     widths
##  [1,]  2 0.15040129
##  [2,]  3 0.08247704
##  [3,]  4 0.08266900
##  [4,]  5 0.06700428
##  [5,]  6 0.05121959
##  [6,]  7 0.05488371
##  [7,]  8 0.04953243
##  [8,]  9 0.05085210
##  [9,] 10 0.04773323

K-means

How many clusters to use?

wssplot <- function(numsub, nc=15, seed=1111) {
  wss <- (nrow(numsub)-1)*sum(apply(numsub,2,var))
  for (i in 2:nc) {
    set.seed(seed)
    wss[i] <- sum(kmeans(numsub, centers=i, iter.max = 1e4)$withinss)}
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")}
wssplot(df_nums_adjscaled)

Solving the k-means model

k <- 2:10
widths <- NULL
for(k_ in k){
  clusterresults <- kmeans(x = df_nums_adjscaled, centers = k_, nstart = 30)
  widths <- c(widths,mean(silhouette(clusterresults$cluster,distMat)[,'sil_width']))
}
plot(k,widths,type='b')

cbind(k,widths)
##        k     widths
##  [1,]  2 0.15633032
##  [2,]  3 0.10924977
##  [3,]  4 0.10559810
##  [4,]  5 0.10292595
##  [5,]  6 0.09512481
##  [6,]  7 0.09275647
##  [7,]  8 0.08705368
##  [8,]  9 0.08322454
##  [9,] 10 0.08221007
# Using cluster centers from hclust:
seg.summ(df_nums_adjscaled, hclust_segments) -> kmeans_centroids
clusterresults <- kmeans(x = df_nums_adjscaled, centers = kmeans_centroids[,-1])
rsquare <- clusterresults$betweenss/clusterresults$totss
cat('\nWithin SS:',clusterresults$withinss,' Sizes:\n',clusterresults$size,'\nrsq:', rsquare)
## 
## Within SS: 3432.662 5125.88 3631.431 5046.119 3485.899 3946.495  Sizes:
##  253 392 214 395 257 289 
## rsq: 0.3470318
clusplot(df_nums_adjscaled, clusterresults$cluster, color=TRUE, shade=TRUE, labels=4, lines=0, main="K-means cluster plot")

plot(silhouette(clusterresults$cluster,distMat))

# Using number of clusters + random starts
k_kmeans = 6
clusterresults <- kmeans(x = df_nums_adjscaled, centers = k_kmeans)
rsquare <- clusterresults$betweenss/clusterresults$totss
cat('\nWithin SS:',clusterresults$withinss,' Sizes:\n',clusterresults$size,'\nrsq:', rsquare)
## 
## Within SS: 4401.516 2878.69 5051.376 3682.317 4713.007 4087.528  Sizes:
##  341 222 380 270 323 264 
## rsq: 0.3431686
clusplot(df_nums_adjscaled, clusterresults$cluster, color=TRUE, shade=TRUE, labels=4, lines=0, main="K-means cluster plot")

plot(silhouette(clusterresults$cluster,distMat))

seg.summ(df_nums_adj, clusterresults$cluster) -> segsummary_results
segsummary_results %>% write_csv(path = '../reports/kmeans_results.csv', col_names = T)
segsummary_results
kmeans_results <- df_nums_adj %>% 
  mutate(cluster = clusterresults$cluster)

purrr::map2(kmeans_results, names(kmeans_results),
            ~bwplot(cluster~.x, kmeans_results,
                    main=.y))
## $q1_age

## 
## $q49_marital

## 
## $q50r1_nochild

## 
## $q56_income

## 
## $q57_mf

## 
## $q13_visitfreq_social

## 
## $q13_visitfreq_music

## 
## $q13_visitfreq_video

## 
## $q24_tech_posatt

## 
## $q24_tech_enter

## 
## $q24_tech_comm

## 
## $q24_tech_negatv

## 
## $q25_prsnlty_leader

## 
## $q25_prsnlty_risk

## 
## $q25_prsnlty_drive

## 
## $q25_prsnlty_follower

## 
## $q26_shopsavvy_bargain

## 
## $q26_shopsavvy_brands

## 
## $q26_shopsavvy_earn2spend

## 
## $q26_shopsavvy_applover

## 
## $q26_shopsavvy_children

## 
## $cluster

PAM

pamFits <- tibble(k_pam = 2:10)
pamFits$pamFits <- map(pamFits$k_pam, ~pam(df_nums_adjscaled, k = .x))
pamFits$sil_avg_width <- map_dbl(pamFits$pamFits,~.x$silinfo$avg.width)
pamFits
pamFits %>% 
  xyplot(sil_avg_width~as.factor(k_pam),.,type='b')

plot(pamFits$pamFits[[1]])

seg.summ(df_nums_adj, pamFits$pamFits[[1]]$clustering)

Model based

mclustFits <- tibble(mclust_clusters=2:8)
mclustFits$mclustFits <- map(mclustFits$mclust_clusters, ~Mclust(df_nums_adjscaled, G = .x))
mclustFits$bic <- map_dbl(mclustFits$mclustFits, ~.x[['bic']])
mclustFits$loglik <- map_dbl(mclustFits$mclustFits, ~.x[['loglik']])
mclustFits
mclustFits %>% 
  xyplot(bic+loglik~as.factor(mclust_clusters),., auto.key = T, type = 'b')

summary(mclustFits[[4,'mclustFits']])
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VII (spherical, varying volume) model with 5 components: 
## 
##  log.likelihood    n  df       BIC      ICL
##       -48849.07 1800 114 -98552.63 -98971.1
## 
## Clustering table:
##   1   2   3   4   5 
## 522 273 313 317 375
mclust_clusters <- mclustFits[[4,'mclustFits']]$classification
clusplot(df_nums_adj, mclust_clusters, color=TRUE, shade=TRUE, labels=4, lines=0, main="M Clust Plot, k = 4")

seg.summ(df_nums_adj, mclust_clusters)
mclust_distMat <- daisy(df_nums_adj)
## Warning in daisy(df_nums_adj): binary variable(s) 2, 3, 5 treated as
## interval scaled
plot(silhouette(mclust_clusters,mclust_distMat))